TxDOT Project


In [1]:
import os
import math
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

import csv
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model

Load in the Data


In [16]:
df = pd.read_csv('data/Bid_Data.csv')
len(df)


C:\Users\Collin\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (27,32) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[16]:
32447

Formatting & Identifying the Data


In [18]:
df = df[df['Rank'] == 1]
df = df[df['Type'] == 'Construction']

df = df.drop(' From', 1)
df = df.drop('To', 1)
df = df.drop('Contract Description', 1)
df = df.drop('Contractor', 1)
df = df.drop('Contract Category', 1)
df = df.drop('LNG MON', 1)
df = df.drop('MONTH', 1)

df['Award Amount'] = df['Award Amount'].str.lstrip('$')
df['Engineers Estimate'] = df['Engineers Estimate'].str.lstrip('$')
df['Award Amount'] = df['Award Amount'].str.replace(',','').astype(float)
df['Engineers Estimate'] = df['Engineers Estimate'].str.replace(',','').astype(float)

#Renaming Variables
df['EngEst'] = df['Engineers Estimate']
df['NBidders'] = df['Number of Bidders']
df['Date'] = pd.to_datetime(df['Letting Date'])
df.set_index('Date' , inplace=True)
df['Year'] = df.index.year
df['Month'] = df.index.month
df['WinBid'] = df['Award Amount']

# Creating New Varialbes
df['Diff'] = df['EngEst'] - df['WinBid']
df['lnWinBid'] = np.log(df['WinBid'])
df['lnEngEst'] = np.log(df['EngEst'])
df['DiffLn'] = df['lnWinBid'] - df['lnEngEst']
df['Within10Percent'] = 1 
df['PercentOff'] = df['Diff'] / df['EngEst']
df['MoreOrLessThan10'] = 0
df['LessThan10'] = 0
df['MoreThan10'] = 0

df.loc[(df.PercentOff > .10) , 'Within10Percent'] = 0
df.loc[(df.PercentOff < -.10) , 'Within10Percent'] = 0
df.loc[(df.PercentOff > .10) , 'MoreOrLessThan10'] = 1
df.loc[(df.PercentOff < -.10) , 'MoreOrLessThan10'] = 2
df.loc[(df.PercentOff > .10) , 'MoreThan10'] = 1
df.loc[(df.PercentOff < -.10) , 'LessThan10'] = 1

print(len(df))


5177

In [15]:
df


Out[15]:
Type CCSJ Letting Call Number Project Length Highway Letting Date District County Number of Bidders ... WinBid Diff lnWinBid lnEngEst DiffLn Within10Percent PercentOff MoreOrLessThan10 LessThan10 MoreThan10
Date
2010-10-21 Construction 0001-05-016 10103012 STP 2009(816)ES 0.200 FM 259 10/21/2010 El Paso El Paso 1 ... 116285.82 7730.08 11.663806 11.728165 -0.064359 1 0.062331 0 0 0
2010-08-11 Construction 0002-01-083 8103285 STP 2011(321) 9.050 SH 20 8/11/2010 El Paso El Paso 6 ... 2477913.10 1365614.90 14.722927 15.161901 -0.438974 0 0.355302 1 0 1
2010-04-07 Construction 0002-08-050 4103999 C 2-8-50 1.250 IH 10 4/7/2010 El Paso Hudspeth 1 ... 228662.50 37724.88 12.340002 12.492707 -0.152704 0 0.141617 1 0 1
2010-08-11 Construction 0002-11-056 8103286 IM 0101(256) 4.316 IH 10 8/11/2010 El Paso Culberson 2 ... 1094961.15 492577.51 13.906229 14.277695 -0.371466 0 0.310277 1 0 1
2010-05-12 Construction 0002-14-033 5103213 STP 2010(756)ES 0.100 FM 258 5/12/2010 El Paso El Paso 4 ... 149069.50 109807.00 11.912168 12.464106 -0.551938 0 0.424168 1 0 1
2010-02-09 Construction 0003-01-051 2103035 IM 0101(254) 1.069 IH 10 2/9/2010 El Paso Culberson 2 ... 409447.31 -66821.41 12.922564 12.744394 0.178169 0 -0.195027 2 1 0
2010-05-12 Construction 0003-05-046 5103208 IM 0201(176) 3.806 IH 20 5/12/2010 Odessa Reeves 4 ... 2284464.17 550755.83 14.641642 14.857630 -0.215988 0 0.194255 1 0 1
2010-02-10 Construction 0003-05-048 2103209 IM 0201(175) 57.050 IH 20 2/10/2010 Odessa Reeves 3 ... 3460027.14 193621.95 15.056787 15.111237 -0.054450 1 0.052994 0 0 0
2010-08-11 Construction 0004-04-076 8103305 STP 2000(400)TE 1.181 IH 20 8/11/2010 Odessa Ward 8 ... 14473728.60 -1989505.68 16.487846 16.339976 0.147870 0 -0.159362 2 1 0
2010-12-08 Construction 0005-05-105 12103231 IM 0202(231) 78.025 IH 20 12/8/2010 Abilene Howard 5 ... 2555217.75 66828.32 14.753648 14.779466 -0.025818 1 0.025487 0 0 0
2010-10-22 Construction 0005-07-049 10103227 STP 2011(623)ES 18.838 IH 20 10/22/2010 Abilene Mitchell 6 ... 7827281.73 1083350.06 15.873126 16.002756 -0.129630 0 0.121579 1 0 1
2010-03-09 Construction 0005-08-097 3103012 STP 2010(601)HES 0.001 IH 20 3/9/2010 Abilene Mitchell 4 ... 85238.00 51205.42 11.353203 11.823665 -0.470463 0 0.375287 1 0 1
2010-08-10 Construction 0005-13-053 8103084 IM 0201(177) 9.164 IH 20 8/10/2010 Odessa Ector 1 ... 6438932.93 -639101.05 15.677873 15.573339 0.104534 0 -0.110193 2 1 0
2010-04-06 Construction 0006-01-091 4103008 IM 0202(230) 9.040 IH 20 4/6/2010 Abilene Mitchell 6 ... 3272602.41 -254919.33 15.001096 14.920000 0.081096 1 -0.084475 0 0 0
2010-08-11 Construction 0006-02-104 8103239 C 6-2-104 8.284 IH 20 8/11/2010 Abilene Nolan 6 ... 1038393.49 82349.54 13.853185 13.929502 -0.076317 1 0.073478 0 0 0
2010-08-10 Construction 0006-02-106 8103085 STP 2011(292)HES 9.889 IH 20 8/10/2010 Abilene Nolan 8 ... 589049.95 92165.21 13.286266 13.431633 -0.145367 0 0.135295 1 0 1
2010-01-05 Construction 0006-04-068 1103023 STP 2010(233)SB 17.488 IH 20 1/5/2010 Abilene Taylor 14 ... 1191750.29 126077.35 13.990934 14.091495 -0.100562 1 0.095671 0 0 0
2010-04-07 Construction 0006-05-108 4103213 STP 2010(559)ES 2.986 IH 20 4/7/2010 Abilene Taylor 3 ... 548674.30 36426.05 13.215260 13.279539 -0.064278 1 0.062256 0 0 0
2010-05-12 Construction 0006-18-052 5103221 STP 2010(845) 3.283 BI 20-R 5/12/2010 Abilene Taylor 5 ... 595110.27 84426.13 13.296502 13.429166 -0.132664 0 0.124241 1 0 1
2010-03-10 Construction 0007-02-047 3103218 STP 2010(602)ES 8.914 IH 20 3/10/2010 Abilene Callahan 7 ... 2294673.03 301437.12 14.646101 14.769525 -0.123424 0 0.116111 1 0 1
2010-01-05 Construction 0007-03-083 1103006 IM 0203(077) 308.483 IH 20 1/5/2010 Brownwood Eastland 4 ... 6349325.82 408080.75 15.663859 15.726150 -0.062291 1 0.060390 0 0 0
2010-02-09 Construction 0007-04-109 2103024 STP 2010(322)HES 7.724 IH 20 2/9/2010 Brownwood Eastland 13 ... 515542.26 393293.65 13.152975 13.719920 -0.566945 0 0.432744 1 0 1
2010-03-10 Construction 0007-06-076 3103217 IM 0203(078) 5.594 IH 20 3/10/2010 Brownwood Eastland 10 ... 2803289.11 120462.21 14.846304 14.888378 -0.042074 1 0.041201 0 0 0
2010-06-03 Construction 0007-06-081 6103009 IM 0203(079) 2.244 IH 20 6/3/2010 Brownwood Eastland 4 ... 1554812.14 -141855.95 14.256865 14.161195 0.095671 0 -0.100397 2 1 0
2010-08-10 Construction 0007-10-054 8103032 STP 2011(203) 7.380 US 180 8/10/2010 Fort Worth Palo Pinto 9 ... 1517352.16 296081.24 14.232477 14.410733 -0.178255 0 0.163271 1 0 1
2010-10-21 Construction 0008-02-070 10103013 STP 2011(472) 179.385 US 180 10/21/2010 Fort Worth Parker 5 ... 7357575.84 1156674.43 15.811241 15.957252 -0.146011 0 0.135852 1 0 1
2010-04-06 Construction 0008-03-098 4103032 IM 0204(271) 5.170 IH 20 4/6/2010 Fort Worth Parker 6 ... 1730019.68 140955.22 14.363643 14.441970 -0.078327 1 0.075338 0 0 0
2010-08-10 Construction 0008-06-048 8103035 BR 2011(098) 0.169 SH 180 8/10/2010 Fort Worth Tarrant 5 ... 1447722.05 48917.86 14.185502 14.218733 -0.033231 1 0.032685 0 0 0
2010-12-07 Construction 0008-13-223 12103027 IM 8204(276) 2.341 IH 820 12/7/2010 Fort Worth Tarrant 4 ... 1243621.79 317799.96 14.033538 14.261107 -0.227569 0 0.203532 1 0 1
2010-09-08 Construction 0008-15-044 9103021 IM 8204(273) 0.100 IH 820 9/8/2010 Fort Worth Tarrant 5 ... 96000.00 31793.18 11.472103 11.758168 -0.286065 0 0.248786 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2016-03-03 Construction 1186-01-089 3163028 STP 2016(636) 4.103 FM 969 3/3/2016 Austin Travis 6 ... 2067551.68 284564.36 14.541876 14.670826 -0.128950 0 0.120982 1 0 1
2016-03-04 Construction 1259-01-041 3163228 STP 2016(812)HES 1.336 FM 1097 3/4/2016 Houston Montgomery 3 ... 238826.40 -41172.80 12.383492 12.194271 0.189221 0 -0.208308 2 1 0
2016-02-10 Construction 1277-01-015 2163228 BR 2016(411) 0.318 FM 1012 2/10/2016 Beaumont Newton 3 ... 1777740.74 -130944.23 14.390854 14.314342 0.076511 1 -0.079515 0 0 0
2016-04-06 Construction 1289-01-029 4163237 BR 1602(046) 3.551 FM 1126 4/6/2016 Dallas Navarro 8 ... 6714487.74 268564.38 15.719778 15.758997 -0.039219 1 0.038459 0 0 0
2016-01-06 Construction 1397-01-031 1163240 C 1397-1-31 3.094 FM 1836 1/6/2016 Dallas Kaufman 5 ... 5943147.30 -906869.64 15.597749 15.432178 0.165572 0 -0.180067 2 1 0
2016-01-06 Construction 1402-03-011 1163242 STP 2016(732) 20.298 FM 1375 1/6/2016 Houston Montgomery 4 ... 1297509.45 352076.70 14.075957 14.316035 -0.240078 0 0.213433 1 0 1
2016-04-06 Construction 1494-03-001 4163222 C 1494-3-1 0.018 FM 3486 4/6/2016 Dallas Kaufman 5 ... 4849821.37 -215531.22 15.394452 15.348994 0.045459 1 -0.046508 0 0 0
2016-02-10 Construction 1526-03-015 2163233 STP 2016(678) 1.865 FM 1606 2/10/2016 Abilene Scurry 5 ... 6876410.33 1435554.66 15.743607 15.933207 -0.189599 0 0.172709 1 0 1
2016-02-09 Construction 1531-02-012 2163022 STP 2016(679) 0.001 FM 1610 2/9/2016 Abilene Scurry 7 ... 5397845.79 369148.18 15.501511 15.567662 -0.066151 1 0.064011 0 0 0
2016-02-09 Construction 1606-02-019 2163020 STP 2014(884) 0.062 FM 2123 2/9/2016 Fort Worth Wise 6 ... 2779919.35 676795.35 14.837932 15.055829 -0.217897 0 0.195791 1 0 1
2016-02-10 Construction 1685-03-088 2163237 STP 2016(731) 1.762 FM 1960 2/10/2016 Houston Harris 3 ... 6636000.51 113980.17 15.708020 15.725050 -0.017030 1 0.016886 0 0 0
2016-01-05 Construction 1685-03-096 1163024 STP 2016(474) 2.088 FM 1960 1/5/2016 Houston Harris 3 ... 191207.77 81159.98 12.161116 12.514908 -0.353793 0 0.297979 1 0 1
2016-03-04 Construction 2003-01-008 3163217 STP 2016(839)HES 3.594 FM 2122 3/4/2016 Paris Lamar 6 ... 763854.27 -9884.53 13.546132 13.533108 0.013025 1 -0.013110 0 0 0
2016-01-05 Construction 2079-01-043 1163037 STP 2013(901) 2.879 FM 1220 1/5/2016 Fort Worth Tarrant 2 ... 8350775.03 -2311318.34 15.937865 15.613825 0.324040 0 -0.382703 2 1 0
2016-03-04 Construction 2263-03-041 3163212 C 2263-3-41 5.830 SH 361 3/4/2016 Corpus Christi Nueces 4 ... 5552811.60 302405.80 15.529815 15.582844 -0.053029 1 0.051647 0 0 0
2016-01-06 Construction 2266-02-138 1163205 NH 2016(594) 0.056 SH 360 1/6/2016 Fort Worth Tarrant 2 ... 2126639.56 -165737.61 14.570054 14.488915 0.081139 1 -0.084521 0 0 0
2016-01-05 Construction 2296-01-051 1163035 NH 2016(565) 7.055 SH 191 1/5/2016 Odessa Ector 3 ... 4387311.96 -338356.71 15.294227 15.213969 0.080258 1 -0.083566 0 0 0
2016-01-05 Construction 2560-01-076 1163033 STP 2016(425) 3.901 SL 224 1/5/2016 Lufkin Nacogdoches 3 ... 789787.24 12523.33 13.579519 13.595251 -0.015732 1 0.015609 0 0 0
2016-04-06 Construction 2607-01-011 4163205 STP 2016(962) 1.492 SH 188 4/6/2016 Corpus Christi Aransas 2 ... 1142011.55 -117358.92 13.948302 13.839864 0.108438 0 -0.114535 2 1 0
2016-04-05 Construction 2754-01-008 4163012 STP 2016(370) 3.626 FM 2705 4/5/2016 Waco Limestone 3 ... 4733942.79 -79712.79 15.370269 15.353287 0.016982 1 -0.017127 0 0 0
2016-04-06 Construction 2887-01-012 4163206 STP 2016(958)HES 4.942 FM 2830 4/6/2016 Beaumont Liberty 5 ... 999523.59 168362.80 13.815034 13.970706 -0.155672 0 0.144160 1 0 1
2016-02-10 Construction 3050-02-024 2163202 STP 2016(119)MM 0.001 FM 2978 2/10/2016 Houston Montgomery 5 ... 15788778.28 612127.00 16.574810 16.612847 -0.038037 1 0.037323 0 0 0
2016-02-10 Construction 3136-01-177 2163239 STP 2016(637) 0.054 SL 1 2/10/2016 Austin Travis 4 ... 787593.55 94266.36 13.576737 13.689788 -0.113051 0 0.106895 1 0 1
2016-02-09 Construction 3256-02-091 2163032 NH 2016(728) 0.026 SL 8 2/9/2016 Houston Harris 3 ... 596804.70 -98636.20 13.299345 13.118694 0.180652 0 -0.197998 2 1 0
2016-04-05 Construction 3271-01-011 4163009 STP 2016(959) 0.016 FM 3180 4/5/2016 Beaumont Chambers 4 ... 6439873.51 -1090631.57 15.678019 15.492465 0.185554 0 -0.203885 2 1 0
2016-03-03 Construction 3277-01-022 3163005 STP 2014(116)TE 30.194 FM 3177 3/3/2016 Austin Travis 5 ... 2045353.10 67623.37 14.531081 14.563608 -0.032527 1 0.032004 0 0 0
2016-01-05 Construction 3348-01-019 1163018 STP 1502(549)HES 0.001 FM 3136 1/5/2016 Fort Worth Johnson 2 ... 6829235.17 -2143649.77 15.736723 15.360001 0.376722 0 -0.457499 2 1 0
2016-02-10 Construction 3417-01-028 2163210 CC 3417-1-28 3.834 FM 734 2/10/2016 Austin Travis 4 ... 839340.91 -60727.46 13.640372 13.565270 0.075102 1 -0.077994 0 0 0
2016-04-05 Construction 3538-01-041 4163027 STP 2016(963) 0.030 SH 242 4/5/2016 Houston Montgomery 2 ... 3635039.14 841444.90 15.106130 15.314348 -0.208218 0 0.187970 1 0 1
2016-02-09 Construction 3538-01-044 2163036 STP 2016(722)HES 3.529 SH 242 2/9/2016 Houston Montgomery 3 ... 275772.88 9539.89 12.527333 12.561341 -0.034008 1 0.033437 0 0 0

5177 rows × 43 columns

Exploring the Data

We see that Bids are Log Normally Distributed


In [26]:
sns.jointplot(x="EngEst", y="WinBid", data=df, kind="reg"); sns.jointplot(x="lnEngEst", y="lnWinBid", data=df, kind="reg");


C:\Users\Collin\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [20]:
cmap = {'0': 'g', '1': 'r', '2': 'b' }
df['cMoreOrLessThan10'] = df.MoreOrLessThan10.apply(lambda x: cmap[str(x)])
print (df.plot('lnEngEst', 'lnWinBid', kind='scatter', c=df.cMoreOrLessThan10))


Axes(0.125,0.125;0.775x0.755)

In [24]:
df_test = df[(df.Year == 2016) & (df.Month == 4)]
print(len(df_test) , 'projects in April 2016')

df_train = df[(df.Year != 2016) | (df.Month != 4)]
print(len(df_train) ,'projects from Jan 2010 to April 2016')

#df_train[['Year','Month']].tail()


67 projects in April 2016
5110 projects from Jan 2010 to April 2016

In [28]:
#Using ALL the Data

Percent = float(df.Within10Percent.sum()) / len(df)
print(round((Percent)*100,2) , '% of All the TxDOT estimates were within 10% of actual bid')

Percent_April_2016 = float(df[(df.Year == 2016) & (df.Month == 4)].Within10Percent.sum()) / len(df_test)
print (round((Percent_April_2016)*100,2) , '% of the April 2016 TxDOT estimates were within 10% of actual bid')


50.34 % of All the TxDOT estimates were within 10% of actual bid
46.27 % of the April 2016 TxDOT estimates were within 10% of actual bid

In [31]:
names_X = ['Length','NBidders','Year','Month','lnEngEst','Time']

def X_y(df):
    X = df[ names_X ]
    y_more = df['MoreThan10']
    y_less =df['LessThan10']
    return X, y_more, y_less

train_X, train_y_more, train_y_less = X_y(df_train)
test_X, test_y_more, test_y_less = X_y(df_test)

print(len(train_y_more))
print(len(train_y_less))
print(len(test_y_more))
print(len(test_y_less))


5110
5110
67
67

In [32]:
test_X.head()


Out[32]:
Length NBidders Year Month lnEngEst Time
Date
2016-04-05 15.055 2 2016 4 14.979827 0.0
2016-04-06 0.016 5 2016 4 14.722602 0.0
2016-04-06 2.024 4 2016 4 16.836501 0.0
2016-04-06 6.294 3 2016 4 14.893941 0.0
2016-04-06 0.455 3 2016 4 13.837735 0.0

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
def RFC_model(X, y):
    """ Performs grid search over the 'n_estimators' parameter for a 
        random forest regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = KFold(n_splits = 30)
    cv_sets.split(X,y)

    # Create a decision tree regressor object
    clf = RandomForestClassifier()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 100
    params = {'n_estimators':range(1,len(X.columns))}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    #scoring_fnc = make_scorer(performance_metric)

    # Create the grid search object
    grid = GridSearchCV(clf, params, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [46]:
model_1 = RFC_model(train_X,train_y_more)

In [47]:
print('correct training classification = ', model_1.score(train_X, train_y_more))
print ('correct testing classification = ', model_1.score(test_X, test_y_more))


correct training classification =  0.929549902153
correct testing classification =  0.641791044776

In [ ]: